---
title: "Cluster Mate Final"
author: "Kapitany-Föveny &  Sulyok"
date: '2020 április 10 '
output: word_document
---

```{r echo=TRUE}
library(outbreaks)
## Warning: package 'outbreaks' was built under R version 3.6.3
library(ggplot2)
library(tidyverse)
library(incidence)
## Warning: package 'incidence' was built under R version 3.6.3
library(tidyr)
library(EpiEstim)
## Warning: package 'EpiEstim' was built under R version 3.6.3
library(data.table)

library(ggpmisc)
## Warning: package 'ggpmisc' was built under R version 3.6.3
### Obtaining the case numbers
CaseData <- read_csv("CaseData2703.csv")

sum<-CaseData %>%
  group_by(Country.Region) %>%
  summarise_each(list(max=max))
a<-sum$Country.Region[c(which(sum$CumCaseNumber_max<=100))]
sum2<-CaseData %>%
  filter(!Country.Region %in% a) 


length(levels(as.factor(sum2$Country.Region)))

DataLong <- melt(sum2[ , c( "Country.Region", "date", "IncCaseNumber" )], id.vars = 1:2)
DataLong$value<-ifelse(DataLong$value <0, 0, DataLong$value)
DataLong<-na.omit(DataLong)
dat<-uncount(DataLong, value)
dat<-dat[,1:2]
#calculate doubling time

doublingtime<-function(x) {
  
  it <- subset(dat, Country.Region==x)
  it<-it[,2]
  i.1 <- incidence(it, interval = 1)
  fit<-fit(i.1)
  print(x)
  print(fit$info$doubling)
}

dtime<-sapply(levels(as.factor(sum2$Country.Region)), doublingtime)

length(dtime)

dtimes<-data.frame(unlist(dtime))
dtimes <- data.frame(Country.Region=rownames(dtimes), dtimes)
rownames(dtimes) <- NULL
dtimes$doublingtime<-dtimes$unlist.dtime.
dtimes2<-dtimes[,-2]




#adding china doubling time

it <- subset(dat, Country.Region=="China")
it<-it[,2]
i.1 <- incidence(it, interval = 1)
fit<-fit_optim_split(i.1)
print(fit$fit$before$info$doubling)
## [1] 2.813805
#data[9,23]<-fit$fit$before$info$doubling ezt meg hozza kell adni
China<-cbind("China", 2.81)
dtimes2<-as.data.frame(rbind(as.matrix(dtimes2), China))
#dtimes2<-as.data.frame(dtimes2[-77,])


#adding south korea
#adding south korea doubling time

it <- subset(dat, Country.Region=="South Korea")
it<-it[,2]
i.1 <- incidence(it, interval = 1)
fit<-fit_optim_split(i.1)
print(fit$fit$before$info$doubling)
#3.659


HAQ <- read.csv("D:/Rprojects/matecovid/HAQ1.txt")
HAQ$year<-HAQ$HAQ.Index..IHME..2017..
HAQ$Country.Region<-HAQ$Year
HAQ$haq<-HAQ$X
HAQ<-subset(HAQ, year==2015)
HAQ<-HAQ[,-c(1:5)]

data<-left_join(dtimes2, HAQ, "Country.Region")

#pop density data
#countries <- fread("http://download.geonames.org/export/dump/countryInfo.txt", skip = "ISO3", na.strings = "")
#countries$popdens<-countries$Population/countries$`Area(in sq km)`
#countries$Country.Region<-countries$Country
#popdens<-countries[, c(20,21,8)]
#write.csv(popdens, "popdens.csv")
popdens <- read_csv("popdens.csv")
popdens<-popdens[2:4]
popdens[234, 2]<-"US"
#gdp data


gdp <- read_csv("gdp25.csv")

moredata<-left_join(popdens, gdp, "Country.Region")
moredata$gdppercap<-moredata$Value/moredata$Population
moredata<-moredata[,-4]

dataa<-left_join(data, moredata, "Country.Region")


#prosperity data

PI <- read_delim("Prosperity Index_2019_Data_szerk.csv", ";", escape_double = FALSE, trim_ws = TRUE)

PI$Country.Region<-factor(PI$Country.Region)

dataa<-left_join(dataa, PI, "Country.Region")
dataa[78,7] <-27608.25 #South Korea # gdppercap  2016according to world #http://datatopics.worldbank.org/world-development-indicators/
dataa[83,7]<-22573 # Taiwan https://countryeconomy.com ??? gdp ??? taiwan
dataa[78,2]<-fit$fit$before$info$doubling #south Korea doubling time

summary(dataa) 
#climatezone
climate.zones <- read.csv("climate zones - Munka1.csv")
climclass<-climate.zones[,c(2,4)]
dataa<-left_join(dataa, climclass, "Country.Region")



tourism.data <- read.csv("tourism data.csv", sep=";")

dataa<-left_join(dataa, tourism.data, "Country.Region")

data<-dataa[,-6]
data$Climateclass<-factor(data$Climateclass)

data$doublingtime<-as.numeric(as.character(data$doublingtime))
data[78, 2]<-fit$fit$before$info$doubling

summary(data)




##clustering

# Prepare Data

datclus <- na.omit(data)
datrf<-datclus[,-1] 
datclus[,-c(1,14)] <- scale(datclus[,-c(1,14)])
datwithnames<-datclus
datclusta<-datclus[,-c(1, 14)]



#modelbased
library(mclust)
set.seed(123456)
mc <- Mclust(datclusta)
#plot(fit) # plot results
summary(mc) # display the best model


mc$modelName                # Optimal selected model ==> "VVI"
mc$G                        # Optimal number of cluster => 4
head(mc$z, 30)              # Probality to belong to a given cluster


library(factoextra)
# BIC values used for choosing the number of clusters
fviz_mclust(mc, "BIC", palette = "jco")
# Classification: plot showing the clustering
fviz_mclust(mc, "classification", geom = "point", 
            pointsize = 1.5, palette = "jco")
# Classification uncertainty
fviz_mclust(mc, "uncertainty", palette = "jco")

datwithnames$cluster<-as.factor(mc$classification)
countrycluster<-datwithnames[,c(1,16)]
countrycluster

data<-left_join(data, countrycluster, "Country.Region")


library(tableone)
vars<-colnames(data[,2:15])
tab<-CreateTableOne(data=data, strata="cluster", vars=vars )
print(tab, nonnormal=vars)
#write.csv(print(tab, nonnormal=vars), "clustertablenew.csv")



#testpermillionpeople



covid19_tests_per_million_people <- read_delim("covid19-tests-per-million-people.csv", ";", escape_double = FALSE, trim_ws = TRUE)

covid19_tests_per_million_people<-covid19_tests_per_million_people[order(-covid19_tests_per_million_people$tetspermillion),]  
covid19_tests_per_million_people

data<-left_join(dataa, covid19_tests_per_million_people, "Country.Region")


ggplot(data, aes(tetspermillion, as.numeric(as.character(doublingtime)), label=Country.Region)) + geom_point() + geom_smooth(method="lm") + geom_text(angle = 45, check_overlap = TRUE) + scale_x_log10() + scale_y_log10()


#goverment stringency index
government_stringency_index <- read_delim("government stringency index.csv", ";", escape_double = FALSE, col_types = cols(Date = col_date(format = "%Y%m%d")), trim_ws = TRUE)

stringency<-government_stringency_index %>%
  group_by(Country.Region) %>%
  summarize(maxstringindex=max(Stringency))
stringency<-na.omit(stringency)
data<-left_join(dataa, stringency, "Country.Region")
summary(data)

data<-left_join(data, countrycluster, "Country.Region")

#other WVS parameters 
WVSextra <- read.csv2("D:/Rprojects/matecovid/WVSextra.csv")
data<-left_join(data, WVSextra, "Country.Region")


clusterone<-subset(data, cluster=="1")
clustertwo<-subset(data, cluster=="2")
clusterthree<-subset(data, cluster=="3")
clusterfour<-subset(data, cluster=="4")

summary(data)
summary(clusterone)
summary(clustertwo)
summary(clusterthree)
summary(clusterfour)



#rfsrc
set.seed(1)
library(randomForestSRC)
datrf[,-13]<-log(datrf[,-13])
dat1<-datrf
summary(dat1)
set.seed(1)
#train<-sample(1:nrow(dat1), round(nrow(dat1)*0.8))
set.seed(1)

tunedgrow<-tune.rfsrc(doublingtime ~. , data = dat1, ntree=100, trace=FALSE, doBest=TRUE)

set.seed(1)
grow<-rfsrc(doublingtime ~. , data = dat1, ntree=100, nodesize=tunedgrow$optimal["nodesize"], mtry=tunedgrow$optimal["mtry"], trace=TRUE, importance=TRUE)
set.seed(1)
#pred<-predict(grow, dat1[-train,])
print(grow)

plot.variable.rfsrc(grow, plots.per.page = 2)

library(ggRandomForests)
set.seed(1)
 md<-gg_minimal_depth(grow)
 md
 plot(md)
 set.seed(1)
mdvimp<-gg_minimal_vimp(grow)
mdvimp
 plot(mdvimp) 
 

set.seed(1)
gg_v <- gg_variable(grow)


plot(gg_v)
p<-plot(gg_v)[[5]]
p+xlab("Freedom of assembly and association (natural logarithm") + ylab("Predicted natural logarithm of doubling time (days)") + scale_y_continuous(trans = "exp")


p<-plot(gg_v)[[8]]
p+xlab("Agency score (natural logarithm") + ylab("Predicted natural logarithm of doubling time (days)")



predicted<-exp(gg_v$yhat)
agency<-exp(gg_v$persfreedomagency)
a<-cbind(predicted, agency)
persfree<-exp(gg_v$PersonalFreedom)
a<-cbind(a, persfree)
civic<-exp(gg_v$Socialcapcivic)
a<-cbind(a, civic)
climate<-factor(gg_v$Climateclass)
a<-cbind(a,climate) 
gdpp<-exp(gg_v$gdppercap)
a<-cbind(a, gdpp)
haq<-exp(gg_v$haq)
a<-cbind(a, haq)
social<-exp(gg_v$Socialcapsocialnetwork)
a<-cbind(a, social)
family<-exp(gg_v$Socialcappersonalandfamily)
a<-cbind(a, family)
popdens<-exp(gg_v$popdens)
a<-as.data.frame(cbind(a, popdens))





p2<-ggplot(a, aes(agency, predicted)) + geom_point() + geom_smooth() + xlab("Agency score") + ylab("Predicted doubling time (days)")
p2



p3<-ggplot(a, aes(persfree, predicted)) + geom_point() + geom_smooth() + xlab("Freedom of assembly and association") + ylab("Predicted doubling time (days)")
p3

p1<-ggplot(a, aes(popdens, predicted)) + geom_point() + geom_smooth() + xlab("Population density") + ylab("Predicted doubling time (days)") + scale_x_log10()
p1

p4<-ggplot(a, aes(civic, predicted)) + geom_point() + geom_smooth() + xlab("Civic and social participation") + ylab("Predicted doubling time (days)")
p4

p5<-ggplot(a, aes(factor(climate), predicted)) + geom_point() + geom_boxplot() + xlab("Climate class") + ylab("Predicted doubling time (days)") + theme(axis.text.x = element_text(angle = 45))
p5

p6<-ggplot(a, aes(gdpp, predicted)) + geom_point() + geom_smooth() + xlab("GDP per capita") + ylab("Predicted doubling time (days)")
p6

p7<-ggplot(a, aes(social, predicted)) + geom_point() + geom_smooth() + xlab("Social network") + ylab("Predicted doubling time (days)")
p7

p8<-ggplot(a, aes(family, predicted)) + geom_point() + geom_smooth() + xlab("Personal and family relationships") + ylab("Predicted doubling time (days)")
p8

p9<-ggplot(a, aes(haq, predicted)) + geom_point() + geom_smooth() + xlab("HAQ index") + ylab("Predicted doubling time (days)")
p9

 library(ggpubr)
 figure<-ggarrange(p1,p3,p2,p4, p5, p6,p7, p8, p9)
 figure
pdf("matecovidfigfinalfig.pdf", width = 10, height = 10)
figure
dev.off()

```
